Introduction

Let us first start by installing and loading the packages we will use during this whole assignment.

# Install a package from the RedIRIS CRAN mirror. install.packages() returns
# NULL invisibly, so is.null(...) is always TRUE; this lets the
# `require(pkg) || i.p("pkg") && require(pkg)` pattern below retry the load
# right after a successful installation.
i.p <- (\(x) is.null(install.packages(x, repos = "https://cran.rediris.es")))

# Load each dependency, installing it first if missing. Note that
# `a || b && c` parses as `a || (b && c)`: if the package is already
# attached the line short-circuits; otherwise it installs and then loads.
# gsub on files
require(xfun)          || i.p("xfun")          && require(xfun)

# Tibbles and plots
require(tidyverse)     || i.p("tidyverse")     && require(tidyverse)
require(patchwork)     || i.p("patchwork")     && require(patchwork)

# Tsibbles and models for them
require(tsibble)       || i.p("tsibble")       && require(tsibble)
require(feasts)        || i.p("feasts")        && require(feasts)
require(fable)         || i.p("fable")         && require(fable)
require(fable.prophet) || i.p("fable.prophet") && require(fable.prophet)
require(fabletools)    || i.p("fabletools")    && require(fabletools)

# Machine Learning
require(modeltime)     || i.p("modeltime")     && require(modeltime)
require(timetk)        || i.p("timetk")        && require(timetk)
require(rsample)       || i.p("rsample")       && require(rsample)
require(parsnip)       || i.p("parsnip")       && require(parsnip)

# Plot theme
theme_set(theme_minimal())

# English for dates (affects month/weekday names on plot axes)
Sys.setlocale(locale = "en_GB.UTF-8")

# Useful functions

# Most frequent value (statistical mode) of a vector, as a numeric scalar.
# Ties are resolved in favour of the first level in table() order.
most_freq <- function(x) {
    counts <- table(x)
    as.numeric(names(counts)[which.max(counts)])
}

# Python-style zip: pair the i-th elements of the supplied vectors/lists.
# Always returns a list of lists (never simplified to a matrix).
zip <- function(...) {
    Map(list, ...)
}

# zip() with a leading `ix` element counting 1..N along the first input,
# mimicking Python's enumerate().
enumerate <- function(...) {
    first_input <- ..1
    zip(ix = seq_along(first_input), ...)
}

# Plot functions

# One fixed colour per series so every figure in the document uses a
# consistent palette (hex RGB values).
colors <- c("CO"    = "#CD3333",
            "C6H6"  = "#EE7621",
            "NOy"   = "#EEC900",
            "NO2"   = "#43CD80",
            "Temp"  = "#009ACD",
            "TempM" = "#8B008B")

# Three-panel diagnostic plot for a series: the series itself on top, its
# ACF bottom-left, and either a histogram/density (type = "histogram") or
# its PACF (any other value) bottom-right. Panels are combined with
# patchwork and printed.
#
# .data   : tsibble holding the series
# .var    : column name (string); also selects the colour from `colors`
# title   : title of the top panel
# type    : "histogram" for a density panel, anything else for a PACF panel
# lag_max : maximum lag forwarded to ACF()/PACF() (NULL = package default)
pretty_ts  <- function(.data, .var, title, type = "histogram", lag_max = NULL) {
    # 1.96/sqrt(n): approximate 95% white-noise bounds drawn on the (P)ACF
    norm <- sqrt(nrow(.data))
    p1   <- .data %>%
        autoplot(.data[[.var]], color = colors[.var]) +
            labs(title = title, x = "", y = "")
    p2   <- .data %>%
        ACF(.data[[.var]], lag_max = lag_max) %>% 
        ggplot(aes(x = lag, y = acf)) +
            geom_hline(aes(yintercept = 0)) +
            geom_segment(mapping = aes(xend = lag, yend = 0),
                         color = colors[.var], alpha = 0.75) +
            geom_hline(aes(yintercept =  1.96/norm),
                       linetype = 2, color = colors[.var]) +
            geom_hline(aes(yintercept = -1.96/norm),
                       linetype = 2, color = colors[.var]) +
            labs(title = "", x = "", y = "ACF")
    if (type == "histogram") {
        p3 <- .data %>% 
            ggplot(aes(x = .data[[.var]])) +
            geom_histogram(aes(y = after_stat(density)),
                           color = "white", fill = colors[.var], alpha = 0.25) +
            geom_density(lwd = 1, color = colors[.var]) +
            labs(title = "", x = "", y = "Density")
    } else {
        p3 <- .data %>%
            PACF(.data[[.var]], lag_max = lag_max) %>% 
            ggplot(aes(x = lag, y = pacf)) +
                geom_hline(aes(yintercept = 0)) +
                geom_segment(mapping = aes(xend = lag, yend = 0),
                             color = colors[.var], alpha = 0.75) +
                geom_hline(aes(yintercept =  1.96/norm),
                           linetype = 2, color = colors[.var]) +
                geom_hline(aes(yintercept = -1.96/norm),
                           linetype = 2, color = colors[.var]) +
                labs(title = "", x = "", y = "PACF")
    }
    
    # patchwork layout: series on top, the two diagnostics side by side
    print(p1 / (p2 | p3))
}

# Residual diagnostics: the .resid series on top, its ACF with white-noise
# bounds bottom-left, and its histogram/density bottom-right.
#
# .data   : residuals() output (tsibble with a `.resid` column)
# .var    : name of the modelled series — used only to pick the colour
# title   : title of the top panel
# lag_max : maximum lag forwarded to ACF() (NULL = package default)
pretty_resid  <- function(.data, .var, title, lag_max = NULL) {
    # 1.96/sqrt(n): approximate 95% white-noise bounds for the ACF
    norm <- sqrt(nrow(.data))
    p1   <- .data %>%
        autoplot(.data[[".resid"]], color = colors[.var]) +
            labs(title = title, x = "", y = "")
    p2   <- .data %>%
        ACF(.data[[".resid"]], lag_max = lag_max) %>% 
        ggplot(aes(x = lag, y = acf)) +
            geom_hline(aes(yintercept = 0)) +
            geom_segment(mapping = aes(xend = lag, yend = 0),
                         color = colors[.var], alpha = 0.75) +
            geom_hline(aes(yintercept =  1.96/norm),
                       linetype = 2, color = colors[.var]) +
            geom_hline(aes(yintercept = -1.96/norm),
                       linetype = 2, color = colors[.var]) +
            labs(title = "", x = "", y = "ACF")
    p3   <- .data %>% 
        ggplot(aes(x = .data[[".resid"]])) +
        geom_histogram(aes(y = after_stat(density)),
                       color = "white", fill = colors[.var], alpha = 0.25) +
        geom_density(lwd = 1, color = colors[.var]) +
        labs(title = "", x = "", y = "Density")
    
    # patchwork layout: residual series on top, ACF and density below
    print(p1 / (p2 | p3))
}

# Side-by-side comparison of a series and its Box-Cox transform, each
# overlaid with its STL trend, to judge visually whether the transformation
# stabilises the variance.
# NOTE(review): relies on as_tibble() of the stl trend producing a column
# named `x` (referenced in autolayer below) and on .data having a `Date`
# column with frequency 24 — confirm against the calling chunks.
compare_box_cox <- function(.data, .var, title) {
    # Automatic-lambda Box-Cox version of the series
    .data$bc <- box_cox_vec(.data[[.var]], silent = TRUE)
    # STL trend components of the raw and transformed series
    stl_nm   <- stl(as.ts(.data)[, .var], 24)$time.series[, "trend"] %>% as_tibble()
    stl_bc   <- stl(as.ts(.data)[, "bc"], 24)$time.series[, "trend"] %>% as_tibble()
    # Re-attach the original time index and convert back to tsibbles
    stl_nm$index <- .data$Date
    stl_bc$index <- .data$Date
    stl_nm   <- stl_nm %>% as_tsibble(index = index)
    stl_bc   <- stl_bc %>% as_tsibble(index = index)
    p1 <- .data %>% 
        autoplot(.data[[.var]], color = colors[.var], alpha = 0.25) +
            autolayer(stl_nm, x, color = colors[.var]) +
            labs(title = title, subtitle = "No transformation applied", x = "", y = "")
    p2 <- .data %>% 
        autoplot(.data[["bc"]], color = colors[.var], alpha = 0.25) +
            autolayer(stl_bc, x, color = colors[.var]) +
            labs(title = "", subtitle = "Box-Cox transformed", x = "", y = "")
    
    print(p1 | p2)
}

This data records the temperature, as well as the concentration in air of different polluting agents at road level in an Italian city. The data is taken from UCI Machine Learning Repository.

Our intent with this data, which is hourly recorded during one year, is to forecast the concentration of those same gases for the next six months and use it to predict the effect on the temperature these agents may have.

Getting the data

The code below will fetch the data and extract it for us. It will also apply basic transformations to help the code parse the input by transforming the fields to the CSV standard.

download_url <- paste0("https://archive.ics.uci.edu/ml/",
                       "machine-learning-databases/00360/AirQualityUCI.zip")

# Fetch the zipped dataset once. mode = "wb" is required so the binary zip
# is not corrupted on Windows, where the default mode is text.
if (!file.exists("AirQualityUCI.zip")) {
    download.file(download_url, "./AirQualityUCI.zip", mode = "wb")
}
# Extract and normalise the raw file to standard CSV: the original uses
# decimal commas and `;` as field separator.
if (!file.exists("AirQualityUCI.csv")) {
    unzip("AirQualityUCI.zip")
    gsub_file("AirQualityUCI.csv", ",",  ".")  # decimal comma -> decimal point
    gsub_file("AirQualityUCI.csv", ";;", "")   # drop empty trailing fields
    gsub_file("AirQualityUCI.csv", ";",  ",")  # field separator -> comma
}

data <- read_csv("AirQualityUCI.csv", show_col_types = FALSE)

Preprocessing

The first thing we need is to create a variable which will act as our temporal index for the time series. This will be composed of the Date and Time variables, which we will join and transform to match the “UTC+1” 1 2 format. We will also take the opportunity to incorporate into the pipeline the replacement of -200 with NA, just as specified on the webpage where the dataset was obtained. These NA values will later be imputed, but not yet. Finally, we will transform the data into a tsibble object for the subsequent analysis and visualization.

# Build the tsibble: merge Date + Time into a single POSIXct index (the raw
# file writes times as H.M.S with dots), drop the now-redundant columns,
# and recode the sentinel value -200 as NA per the dataset documentation.
data_ts <- data %>%
    add_column(Datetime = as.POSIXct(paste(data$Date, data$Time), 
                                     format = "%d/%m/%Y %H.%M.%S",
                                     tz = "GMT"), .before = 1) %>% 
    select(!c(Date, Time)) %>% 
    replace(. == -200, NA) %>% 
    as_tsibble(index = Datetime)

Now we select what are the time series we are interested in. We refer to the information of the different time series present in the data and pick those which are taken from “reference analyser” along with the temperature. These will be the time series we will focus for the rest of the assignment.

# Keep the index, the temperature ("T") and the reference-analyser series
# (columns suffixed "(GT)"); the sensor-response columns are discarded.
data_ts <- data_ts %>% 
    select(c("Datetime", "T", ends_with("(GT)")))

Now we look at the distribution of the missing values. We note that for the variable NMHC(GT) the huge majority of the data are in reality NAs. Hence we decide to remove the aforementioned variable and keep the rest which have a count of NAs which we may handle.

# Count missing values per column; NMHC(GT) is almost entirely NA, so drop it
data_ts %>% is.na() %>% colSums()
## Datetime        T   CO(GT) NMHC(GT) C6H6(GT)  NOx(GT)  NO2(GT) 
##        0      366     1683     8443      366     1639     1642
data_ts <-  data_ts %>% select(!"NMHC(GT)")

Lastly, for convenience, we rename the columns to get rid of the (GT) which was used to indicate reference analyser variables.

# Tidy the column names: strip the "(GT)" suffix from the reference-analyser
# columns and give the index and temperature columns friendlier names.
names(data_ts) <- gsub("(GT)", "", names(data_ts), fixed = TRUE)
names(data_ts)[c(1, 2)] <- c("Date", "Temp")

Descriptive Analysis

Let us visualize all the considered variables first. For the temperature we appreciate the oscillatory behaviour expected from an annual period. If we look at the ACF, we see a clear daily seasonality, which makes sense for temperature measurements.

# Series / ACF / density panels for the temperature
data_ts %>% pretty_ts("Temp", title = "Temperature [°C]")

For the CO concentrations we have a different picture. First, we can note significantly the presence of missing values in the time series plot. Given that these come in patches, it is easy to suppose that they may come from sensor shutdown at concrete time periods. It is also worth noting a decrease of CO concentration in summer. This is probably due to the absence of heating devices given the warmer weather in the analysed region.

Looking at the ACF we observe a semi-daily period, which may come from the fact that heating devices, one of the main CO emitters, are switched with a usually semi-daily period, namely during the morning and late afternoon. Nonetheless, its seasonality appears to be weaker than in the case of the temperature.

# Series / ACF / density panels for carbon monoxide
data_ts %>% pretty_ts("CO", title = "CO [mg/m³]")

Benzene (\(C_6H_6\)) emission channels are a subset of those for carbon monoxide. This is appreciated when looking at the plots, which exhibit a very similar behaviour for the time series. However, in the ACF we note that the seasonality is now only daily and with a much smaller trend component. Our intuition is that the reduced set of emission channels is the cause of this difference.

# Series / ACF / density panels for benzene
data_ts %>% pretty_ts("C6H6", title = "Benzene [μg/m³]")

Nitrogen oxides are another family of polluting agents. Here we have two columns which refer to them: \(NO_x\) and \(NO_2\). The first incorporates the effect of all nitrogen oxides. It is thus interesting to get rid of the \(NO_2\) contribution in the \(NO_x\) column in order to decouple both variables. We do so by using the formula conversion between ppb and μg/m³, \[\begin{equation*} C(\mathrm{ppb}) = C(\mu\mathrm{g/m}^3) \cdot \frac{V(\mathrm{air})}{M(NO_2)} \;, \end{equation*}\] where \(M(NO_2)\) refers to the molecular mass of nitrogen dioxide, 46.006 g/mol, and \(V(\mathrm{air})\) is calculated using Gay-Lussac’s law considering approximating the air pressure as a constant and taking the volume to be \(24.45\) litres at \(25\) degree Celsius.

# Strip the NO2 contribution (reported in μg/m³) from NOx (reported in ppb).
# Conversion: C(ppb) = C(μg/m³) * 24.45/46.006, where 24.45 L/mol is the
# molar volume at 25 °C and 46.006 g/mol the NO2 molar mass; the final
# factor rescales the molar volume from 25 °C to the measured temperature.
# Fix: use the correct Kelvin offsets 273.15 / 298.15 (0 °C and 25 °C);
# the original used 273.25 / 298.25.
data_ts$NOy <- data_ts$NOx - 24.45/46.006*data_ts$NO2 * (273.15 + data_ts$Temp)/298.15

# Delete pathological cases: negative concentrations are unphysical
# artefacts of the subtraction
data_ts$NOy[data_ts$NOy < 0] <- NA

We have renamed this, now \(NO_2\) stripped, variable to \(NO_y\) for convenience. It is interesting to note the sudden increase that happens at the end of the summer which does not seem to disappear on the following year.

Its seasonality, present in the ACF, is both, semi-daily and daily, with the latter being more prominent. As an explanation, we may track car pollution, which operate on the same frequency, e.g. workers going and coming back from work by car or bus.

# Series / ACF / density panels for the NO2-stripped nitrogen oxides
data_ts %>% pretty_ts("NOy", title = "Nitrogen Oxides [ppb]")

The \(NO_2\) exhibits a very different pattern, justifying the prior decoupling we made. For once, it does not appear to have sudden changes when in different year epochs like the rest of gases, with a concentration that, at first glance, resembles a kind of stationary signal.

Moreover, the seasonality appears to be strictly daily, with a very fast ACF decrease, which indicates absence of trend.

# Series / ACF / density panels for nitrogen dioxide
data_ts %>% pretty_ts("NO2", title = "Nitrogen Dioxide [μg/m³]")

Forecasting

In this section we will attempt to forecast both the gases and the temperature, assuming a relation between them. The target will be a 6-month forecast. First, we will need to find suitable models for each of the gases and forecast them into the future. Then, we will use the predictions to construct a model in which the temperature may be estimated with a time series model and the forecasts of all the gases. There, we expect to learn which of the gases have a bigger impact on the weather.

Polluting agents

Given that we are treating four time series, one for each polluting agent, we ought to have an unified and organised schema of the steps for the correct modelling and forecast. Let us detail the recipe we will attempt to follow for the following part:

  1. For each of the agents we will see if a transformation would be fit to help and normalise the distribution of the values. We can note that in general we have very right-skewed distributions which may benefit from a low-\(\lambda\) Box-Cox transformation.

  2. We will implement five models for each agent: an automatic ARIMA, a manual ARIMA, an automatic ETS, a Prophet model and an automatic TSLM with a various Fourier components which account for all possible periodicities. For the manual ARIMA we will proceed by differentiating to white-noise and interpret the ACF and PACF.

  3. Each of these models will be tested using 3-fold cross-validation. Using the RMSE we will extract which model performed better. Then, we will select the model which performed better on the majority of the folds.

  4. The selected model will later be used to forecast the next 6-month concentrations of the different polluting agents.

Before starting the process, let us divide the data set into a training partition, on which we will make decisions and apply cross-validation, and a test partition, to get an estimation of the performance we may expect. This split will be made giving the training set 5 times more data than the test one. For the cross-validation we will consider 7 months as the training part and 1 more month for testing. Note that cross-validation splits are taken from the train partition. We also take here the chance to impute all NA data, as they make some of the models invalid. These are imputed with the value just before them.

# Input missing values
# fill() carries the last observation forward (LOCF) for every column
data_ts <- data_ts %>% fill(everything())

# Duplicate Temp variable for later convenience
data_ts <- data_ts %>% mutate(TempM = Temp)

# Train-test split and CV splits
# Chronological split: first 10 months train, last 2 months test (~5:1)
traintest <- data_ts %>%
    as_tibble() %>% 
    time_series_split(date_var = Date, initial = "10 months", assess = "2 months")
# 3 rolling-origin CV folds inside the train partition: 7 months train,
# 1 month assess, window rolled forward by 1 month (not cumulative)
cvsplits  <- training(traintest) %>% 
    time_series_cv(date_var    = Date,
                   initial     = "7 months",
                   assess      = "1 month",
                   skip        = "1 month",
                   slice_limit = 3,
                   cumulative  = FALSE)

# Convert to tsibble for convenience
train_ts <- training(traintest) %>% as_tsibble(index = Date)
test_ts  <- testing(traintest)  %>% as_tsibble(index = Date)

Carbon Monoxide

First of all, we are going to consider a Box-Cox transformation to get a more normal distribution of the values, reducing skewness and heteroskedasticity. By looking at the following plots it does indeed appear that this transformation helps in getting a time series with which we may work more easily.

# Compare raw vs Box-Cox-transformed CO with their STL trends
train_ts %>% compare_box_cox("CO", "CO atmosferic concentration [mg/m³]")

Now we need to examine the time series to create an appropriate ARIMA model. For this purpose, we are going to apply a 24-hour differentiation, which we decide upon by looking at the next graph. Then, we check and see using unitroot tests that a normal difference is not needed for the time series to reach a white-noise behaviour.

# PACF view of the Box-Cox-transformed CO, to pick the seasonal difference
train_ts %>%
    mutate(CO = box_cox_vec(CO, silent = TRUE)) %>% 
    pretty_ts("CO", "Box-Cox transformed CO", type = "partial", lag_max = 72)

# Let's differentiate seasonably. Then check if more differences are needed
train_ts %>% 
    mutate(BC_CO  = box_cox_vec(CO, silent = TRUE),
           dBC_CO = difference(BC_CO, 24)) %>% 
    features(dBC_CO, unitroot_ndiffs)
## # A tibble: 1 × 1
##   ndiffs
##    <int>
## 1      0
# Difference and see if stationary
train_ts %>%
    mutate(CO = difference(box_cox_vec(CO, silent = TRUE), 24)) %>% 
    pretty_ts("CO", "Diff_24 Box-Cox transformed CO", type = "partial", lag_max = 72)

Looking at the ACF and PACF we decide that an \(ARIMA(3,0,0)(3,1,0)[24]\) is the optimal choice. Having that in mind, we proceed to test this model along with the rest of the considered: auto ARIMA, auto ETS, Prophet and TSLM.

# Look at PACF    => p = 3
# Look at PACF_24 => P = 3
# Cross-validate the five candidate CO models; each sp from enumerate() is
# list(ix = fold number, <rsample split object>).
for (sp in enumerate(cvsplits$splits)) {
    trndat <- training(sp[[2]]) %>% as_tsibble(index = Date)
    tstdat <- testing(sp[[2]])  %>% as_tsibble(index = Date)

    # Box-Cox lambda estimated on this fold's training part only
    lam <- auto_lambda(trndat$CO)
    models_CO <- trndat %>% 
        model(
            manual_arima = ARIMA(box_cox(CO, lam) ~ pdq(3, 0, 0) + PDQ(3, 1, 0, period = 24)),
            auto_arima   = ARIMA(box_cox(CO, lam) ~ PDQ(period = 24)),
            prophet      = prophet(box_cox(CO, lam) ~ season(period = 24)),
            auto_ets     = ETS(box_cox(CO, lam) ~ season(period = 24)),
            auto_tslm    = TSLM(box_cox(CO, lam) ~ trend() +
                                    fourier(period = "1 day",   K = 3) +
                                    fourier(period = "1 year",  K = 3))) %>% 
        forecast(tstdat) %>% 
        accuracy(data_ts)
    
    # Report the model with the lowest RMSE on this fold
    print(paste("The best model for split", sp$ix, "is",
                models_CO$.model[which.min(models_CO$RMSE)]))
}
## [1] "The best model for split 1 is prophet"
## [1] "The best model for split 2 is prophet"
## [1] "The best model for split 3 is prophet"

We see that the residuals follow a normal distribution and a white-noise like waveform. Nevertheless, the ACF does still not resemble a white-noise ACF. We also visualize its performance on the test partition which captures the tendency and with the confidence intervals offers a good approximation on the range where real values are measured. It fails nevertheless to reproduce the correct oscillatory movement.

# Refit the winning Prophet model for CO on the full training partition
lam_CO   <- auto_lambda(train_ts$CO)
model_CO <- train_ts %>%
    model(prophet(box_cox(CO, lam_CO) ~ season(period = 24)))
# Residual diagnostics for the fitted model
model_CO %>%
    residuals() %>%
    pretty_resid("CO", title = "Residuals for Carbon Monoxide [mg/m³]")

# Forecast over the test horizon, overlaid on the observed data (grey)
train_ts %>%
    filter(year(Date) == 2005) %>% 
    autoplot(CO) +
    autolayer(forecast(model_CO, test_ts), CO, color = colors["CO"]) +
    autolayer(test_ts, CO, color = "#838B8B", alpha = 0.75) +
    labs(x = "", y = "Carbon Monoxide [mg/m³]")

Benzene

As with CO, we are going to consider a Box-Cox transformation for the same reasons. By looking at the following plots it does indeed appear that this transformation helps in getting a time series with which we may work more easily.

# Compare raw vs Box-Cox-transformed benzene with their STL trends
train_ts %>% compare_box_cox("C6H6", "Benzene atmosferic concentration [μg/m³]")

Looking at previous ACF, we are going to apply a 24-hour differentiation. Then, we check and see using unitroot tests that a normal difference is not needed for the time series to reach a white-noise behaviour.

# PACF view of the Box-Cox-transformed benzene
train_ts %>%
    mutate(C6H6 = box_cox_vec(C6H6, silent = TRUE)) %>% 
    pretty_ts("C6H6", "Box-Cox transformed Benzene", type = "partial", lag_max = 72)

# Let's differentiate seasonably. Then check if more differences are needed
train_ts %>% 
    mutate(BC_C6H6  = box_cox_vec(C6H6, silent = TRUE),
           dBC_C6H6 = difference(BC_C6H6, 24)) %>% 
    features(dBC_C6H6, unitroot_ndiffs)
## # A tibble: 1 × 1
##   ndiffs
##    <int>
## 1      0
# Difference and see if stationary
train_ts %>%
    mutate(C6H6 = difference(box_cox_vec(C6H6, silent = TRUE), 24)) %>% 
    pretty_ts("C6H6", "Diff_24 Box-Cox transformed Benzene", type = "partial", lag_max = 72)

Looking at the ACF and PACF we decide that an \(ARIMA(2,0,0)(3,1,0)[24]\) is the optimal choice. Having that in mind, we proceed to test this model along with the rest of the considered: auto ARIMA, auto ETS Prophet and TSLM.

# Look at PACF    => p = 2
# Look at PACF_24 => P = 3
# Cross-validate the five candidate benzene models over the CV folds
for (sp in enumerate(cvsplits$splits)) {
    trndat <- training(sp[[2]]) %>% as_tsibble(index = Date)
    tstdat <- testing(sp[[2]])  %>% as_tsibble(index = Date)

    # Box-Cox lambda estimated on this fold's training part only
    lam <- auto_lambda(trndat$C6H6)
    models_C6H6 <- trndat %>% 
        model(
            manual_arima = ARIMA(box_cox(C6H6, lam) ~ 1 + pdq(2, 0, 0) +
                                     PDQ(3, 1, 0, period = 24)),
            auto_arima   = ARIMA(box_cox(C6H6, lam) ~ PDQ(period = 24)),
            prophet      = prophet(box_cox(C6H6, lam) ~ season(period = 24)),
            auto_ets     = ETS(box_cox(C6H6, lam) ~ season(period = 24)),
            auto_tslm    = TSLM(box_cox(C6H6, lam) ~ trend() +
                                    fourier(period = "1 day",   K = 3) +
                                    fourier(period = "1 year",  K = 3))) %>% 
        forecast(tstdat) %>% 
        accuracy(data_ts)

    # Report the model with the lowest RMSE on this fold
    print(paste("The best model for split", sp$ix, "is",
                models_C6H6$.model[which.min(models_C6H6$RMSE)]))
}
## [1] "The best model for split 1 is prophet"
## [1] "The best model for split 2 is prophet"
## [1] "The best model for split 3 is prophet"

We see that the residuals follow a normal distribution and a white-noise like waveform. Nevertheless, the ACF does still not resemble a white-noise ACF. As with CO, the forecast captures the tendency but not the correct oscillatory behaviour

# Refit the winning Prophet model for benzene on the full training partition
lam_C6H6   <- auto_lambda(train_ts$C6H6)
model_C6H6 <- train_ts %>%
    model(prophet(box_cox(C6H6, lam_C6H6) ~ season(period = 24)))
# Fix: diagnose the benzene model's residuals — the original mistakenly
# reused model_CO and the "CO" colour/label here.
model_C6H6 %>%
    residuals() %>%
    pretty_resid("C6H6", title = "Residuals for Benzene [μg/m³]")

# Forecast over the test horizon, overlaid on the observed data (grey)
train_ts %>%
    filter(year(Date) == 2005) %>% 
    autoplot(C6H6) +
    autolayer(forecast(model_C6H6, test_ts), C6H6, color = colors["C6H6"]) +
    autolayer(test_ts, C6H6, color = "#838B8B", alpha = 0.75) +
    labs(x = "", y = "Benzene [μg/m³]")

Nitrogen Oxides

This may be the variable with more heteroskedasticity among the batch. Hence we Box-Cox transform in an attempt to at least mitigate some of the unevenness. In contrast with the previous gases, here the utility of the transformation is critical in getting a somewhat homoskedastic time series.

# Compare raw vs Box-Cox-transformed NOy with their STL trends
train_ts %>% compare_box_cox("NOy", "Nitrogen Oxides atmosferic concentration [ppb]")

If we plot the ACF of the transformed data, we see justification for the seasonal differentiation.

# PACF view of the Box-Cox-transformed NOy
train_ts %>%
    mutate(NOy = box_cox_vec(NOy, silent = TRUE)) %>% 
    pretty_ts("NOy", "Box-Cox transformed Nitrogen Oxides", type = "partial", lag_max = 72)

# Let's differentiate seasonably. Then check if more differences are needed
train_ts %>% 
    mutate(BC_NOy  = box_cox_vec(NOy, silent = TRUE),
           dBC_NOy = difference(BC_NOy, 24)) %>% 
    features(dBC_NOy, unitroot_ndiffs)
## # A tibble: 1 × 1
##   ndiffs
##    <int>
## 1      0
# Difference and see if stationary
train_ts %>%
    mutate(NOy = difference(box_cox_vec(NOy, silent = TRUE), 24)) %>% 
    pretty_ts("NOy", "Diff_24 Box-Cox transformed Nitrogen Oxides", type = "partial", lag_max = 72)

Looking at the ACF and PACF we decide that an \(ARIMA(3,0,0)(3,1,0)[24]\) is the optimal choice. Having that in mind, we proceed to test this model along with the rest of the considered: auto ARIMA, auto ETS, Prophet and TSLM.

# Look at PACF    => p = 3
# Look at PACF_24 => P = 3   (fixed typo: was "PACF_34")
# Cross-validate the five candidate NOy models over the CV folds
for (sp in enumerate(cvsplits$splits)) {
    trndat <- training(sp[[2]]) %>% as_tsibble(index = Date)
    tstdat <- testing(sp[[2]])  %>% as_tsibble(index = Date)

    # Box-Cox lambda estimated on this fold's training part only
    lam <- auto_lambda(trndat$NOy)
    models_NOy <- trndat %>% 
        model(
            manual_arima = ARIMA(box_cox(NOy, lam) ~ 1 + pdq(3, 0, 0) + PDQ(3, 1, 0, period = 24)),
            auto_arima   = ARIMA(box_cox(NOy, lam) ~ PDQ(period = 24)),
            prophet      = prophet(box_cox(NOy, lam) ~ season(period = 24)),
            auto_ets     = ETS(box_cox(NOy, lam) ~ season(period = 24)),
            auto_tslm    = TSLM(box_cox(NOy, lam) ~ trend() +
                                    fourier(period = "1 day",   K = 3) +
                                    fourier(period = "1 year",  K = 3))) %>% 
        forecast(tstdat) %>% 
        accuracy(data_ts)

    # Report the model with the lowest RMSE on this fold
    print(paste("The best model for split", sp$ix, "is",
                models_NOy$.model[which.min(models_NOy$RMSE)]))
}
## [1] "The best model for split 1 is prophet"
## [1] "The best model for split 2 is prophet"
## [1] "The best model for split 3 is prophet"

We see that the residuals follow a normal distribution and a white-noise like waveform. Nevertheless, the ACF does still not resemble a white-noise ACF. On the performance in the test partition, the same conclusions extracted before apply here.

# Refit the winning Prophet model for NOy on the full training partition
lam_NOy   <- auto_lambda(train_ts$NOy)
model_NOy <- train_ts %>%
    model(prophet(box_cox(NOy, lam_NOy) ~ season(period = 24)))
# Residual diagnostics for the fitted model
model_NOy %>%
    residuals() %>%
    pretty_resid("NOy", title = "Residuals for Nitrogen Oxides [ppb]")

# Forecast over the test horizon, overlaid on the observed data (grey)
train_ts %>%
    filter(year(Date) == 2005) %>% 
    autoplot(NOy) +
    autolayer(forecast(model_NOy, test_ts), NOy, color = colors["NOy"]) +
    autolayer(test_ts, NOy, color = "#838B8B", alpha = 0.75) +
    labs(x = "", y = "Nitrogen Oxides [ppb]")

Nitrogen Dioxide

Different to the rest of the polluting agents, this one does not seem to require of a transformation. However, this transformation will ensure the positivity of all predicted values in the future, which is desirable. Hence we apply it here also.

# Compare raw vs Box-Cox-transformed NO2 with their STL trends
train_ts %>% compare_box_cox("NO2", "Nitrogen Dioxide atmosferic concentration [μg/m³]")

Looking at the ACF in the previous section, we note that there is a strong daily seasonality. Hence we differentiate with 24 lags.

If we plot the ACF of the transformed data, we see justification for the seasonal differentiation.

# PACF view of the Box-Cox-transformed NO2.
# Fix: the plot title mistakenly said "Nitrogen Oxides" for the NO2 series.
train_ts %>%
    mutate(NO2 = box_cox_vec(NO2, silent = TRUE)) %>% 
    pretty_ts("NO2", "Box-Cox transformed Nitrogen Dioxide", type = "partial", lag_max = 72)

# Let's differentiate seasonably. Then check if more differences are needed
train_ts %>% 
    mutate(BC_NO2  = box_cox_vec(NO2, silent = TRUE),
           dBC_NO2 = difference(BC_NO2, 24)) %>% 
    features(dBC_NO2, unitroot_ndiffs)
## # A tibble: 1 × 1
##   ndiffs
##    <int>
## 1      0
# Difference and see if stationary
train_ts %>%
    mutate(NO2 = difference(box_cox_vec(NO2, silent = TRUE), 24)) %>% 
    pretty_ts("NO2", "Diff_24 Box-Cox transformed Nitrogen Dioxide",
              type = "partial", lag_max = 72)

Looking at the ACF and PACF we decide that an \(ARIMA(2,0,0)(0,1,3)[24]\) is the optimal choice. Having that in mind, we proceed to test this model along with the rest of the considered: auto ARIMA, auto ETS, Prophet and TSLM.

# Look at PACF    => p = 2   (comment said p = 1, but the model and the text
#                             above both use pdq(2, 0, 0))
# Look at ACF_24  => Q = 3
# Cross-validate the five candidate NO2 models over the CV folds
for (sp in enumerate(cvsplits$splits)) {
    trndat <- training(sp[[2]]) %>% as_tsibble(index = Date)
    tstdat <- testing(sp[[2]])  %>% as_tsibble(index = Date)

    # Box-Cox lambda estimated on this fold's training part only
    lam <- auto_lambda(trndat$NO2)
    models_NO2 <- trndat %>% 
        model(
            manual_arima = ARIMA(box_cox(NO2, lam) ~ 1 + pdq(2, 0, 0) + PDQ(0, 1, 3, period = 24)),
            auto_arima   = ARIMA(box_cox(NO2, lam) ~ PDQ(period = 24)),
            prophet      = prophet(box_cox(NO2, lam) ~ season(period = 24)),
            auto_ets     = ETS(box_cox(NO2, lam) ~ season(period = 24)),
            auto_tslm    = TSLM(box_cox(NO2, lam) ~ trend() + 
                                    fourier(period = "1 day",   K = 3) +
                                    fourier(period = "1 year",  K = 3))) %>% 
        forecast(tstdat) %>% 
        accuracy(data_ts)

    # Report the model with the lowest RMSE on this fold
    print(paste("The best model for split", sp$ix, "is",
                models_NO2$.model[which.min(models_NO2$RMSE)]))
}
## [1] "The best model for split 1 is manual_arima"
## [1] "The best model for split 2 is auto_arima"
## [1] "The best model for split 3 is auto_arima"

We see that the residuals follow a normal distribution and a white-noise like waveform. In contrast to previous series, here the ACF does resemble much closer the one expected from a white-noise signal. On the other hand, when comparing with the test partition, we see very wide confidence intervals which result in a much less precise model.

# Refit the selected auto-ARIMA for NO2 on the full training partition.
lam_NO2   <- auto_lambda(train_ts$NO2)
model_NO2 <- train_ts %>%
    # Fix: use lam_NO2 (computed just above and otherwise unused); the
    # original referenced `lam`, a stale leftover from the CV loop.
    model(ARIMA(box_cox(NO2, lam_NO2) ~ PDQ(period = 24)))
model_NO2 %>%
    residuals() %>%
    pretty_resid("NO2", title = "Residuals for Nitrogen Dioxide [μg/m³]")

# Forecast over the test horizon, overlaid on the observed data (grey);
# the y axis is clipped at 500 to keep wide intervals from dominating
train_ts %>%
    filter(year(Date) == 2005) %>% 
    autoplot(NO2) +
    autolayer(forecast(model_NO2, test_ts), NO2, color = colors["NO2"]) +
    autolayer(test_ts, NO2, color = "#838B8B", alpha = 0.75) +
    labs(x = "", y = "Nitrogen Dioxide [μg/m³]") +
    coord_cartesian(ylim = c(NA, 500))

We observe that for all the polluting agents the best model overall was Prophet except for NO2 where automatic ARIMA performed best. Hence, we will use it in the following section to forecast the concentration of the various gases in order to test their effect on the temperature. For now, let us move on to temperature and how a univariate and multivariate approach on its forecast may behave.

Temperature

Now we are tasked to find a good model to describe the temperature as a function of the different polluting agents. We will attempt this by first finding a suitable model for the temperature just as we did for the gases. Then, we will play with the number of predictors to include in the model. At last, we will consider the 6-month forecast as we did for pollutants.

Univariate model

Let us start by finding the model. We do not believe that a Box-Cox transformation is needed given the apparent homoskedasticity of the data. Now we examine the time series to create an appropriate ARIMA model. For this purpose, we are going to apply a 24-hour differentiation. Then, we check and see using unitroot tests that a normal difference is not needed.

# Temperature: no Box-Cox needed, only a 24-hour seasonal difference
# Let's differentiate seasonably. Then check if more differences are needed
train_ts %>% 
    mutate(dTemp = difference(Temp, 24)) %>% 
    features(dTemp, unitroot_ndiffs)
## # A tibble: 1 × 1
##   ndiffs
##    <int>
## 1      0
# Difference and see if stationary
train_ts %>% 
    mutate(Temp = difference(Temp, 24)) %>% 
    pretty_ts("Temp", "Diff_24 Temperature", type = "partial", lag_max = 72)

Looking at the ACF and PACF we decide that an \(ARIMA(2,0,0)(2,1,0)[24]\) is the optimal choice. Having that in mind, we proceed to test this model along with the rest of the considered: auto ARIMA, auto ETS, Prophet and TSLM.

# Look at PACF    => p = 2
# Look at PACF_24 => P = 2
# Cross-validate the five candidate temperature models over the CV folds
for (sp in enumerate(cvsplits$splits)) {
    trndat <- training(sp[[2]]) %>% as_tsibble(index = Date)
    tstdat <- testing(sp[[2]])  %>% as_tsibble(index = Date)
    
    models_Temp <- trndat %>% 
        model(
            manual_arima = ARIMA(Temp ~ 1 + pdq(2, 0, 0) + PDQ(2, 1, 0, period = 24)),
            auto_arima   = ARIMA(Temp ~ PDQ(period = 24)),
            prophet      = prophet(Temp ~ season(period = 24)),
            auto_ets     = ETS(Temp ~ season(period = 24)),
            auto_tslm    = TSLM(Temp ~ trend() + fourier(period = "1 day",   K = 3) +
                                    fourier(period = "1 year",  K = 3))) %>% 
        forecast(tstdat) %>% 
        accuracy(data_ts)
    
    # Report the model with the lowest RMSE on this fold
    print(paste("The best model for split", sp$ix, "is",
                models_Temp$.model[which.min(models_Temp$RMSE)]))
}
## [1] "The best model for split 1 is prophet"
## [1] "The best model for split 2 is auto_arima"
## [1] "The best model for split 3 is prophet"

Let us proceed as usual and look at residuals and performance in the test partition. Residuals exhibit the same behaviour as we noted with the polluting agents, with a normal distribution zero-centred but very high correlations as noted in the ACF. For the performance in the test partition, we note a very poor prediction which, although not bad for the first month, quickly starts falling below the actual recorded data. This is a sign that the model is not able to see the whole yearly seasonality, which is not too rare considering that we only have one full year of data, hence less in the train partition.

# Fit the Prophet model chosen by cross-validation on the whole training set.
model_Temp <- model(train_ts, prophet(Temp ~ season(period = 24)))

# Residual diagnostics for the fitted model.
model_Temp %>%
    residuals() %>%
    pretty_resid("Temp", title = "Residuals for Temperature")

# Test-partition forecast overlaid on the 2005 part of the training series.
train_ts %>%
    filter(year(Date) == 2005) %>%
    autoplot(Temp) +
    autolayer(forecast(model_Temp, test_ts), Temp, colour = colors["Temp"]) +
    autolayer(test_ts, Temp, colour = "#838B8B", alpha = 0.75) +
    labs(x = "", y = "Temperature [°C]")

Without further ado, let us move directly to the last point of this whole section, the multivariate forecast on the temperature.

Multivariate model

We are now going to consider adding the gases to the model of the temperature. For this, we will compare two models which work very well and with good stability when adding predictors 3: TSLM and VAR. Note that the VAR model is able to predict not only for the temperature but also for all the agents at once. Nonetheless, we are interested in the temperature as we expect the presence of one concrete agent not to have a big impact on another 4. Hence we will only compare the performance for the temperature.

Let us start by determining the TSLM to compare, engaging in a manual iterative process that removes predictors which do not happen to be related to the temperature. As a first approximation, we consider all the polluting agents both at the same moment and with a one-day lagged effect.

# Step 1: every agent enters the model, both contemporaneously and with a
# one-day (24 h) lag, on top of the trend and Fourier seasonal terms.
predict_model <- train_ts %>%
    model(TSLM(TempM ~ trend() +
                       fourier(period = "1 day", K = 3) +
                       fourier(period = "1 year", K = 3) +
                       CO + C6H6 + NOy + NO2 +
                       lag(CO, 24) + lag(C6H6, 24) +
                       lag(NOy, 24) + lag(NO2, 24)))
report(predict_model)
## Series: TempM 
## Model: TSLM 
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -12.46016  -2.11269  -0.02342   2.23941  12.77775 
## 
## Coefficients:
##                                            Estimate Std. Error t value Pr(>|t|)
## (Intercept)                              27.0125332  0.7104920  38.019  < 2e-16
## trend()                                  -0.0025845  0.0001873 -13.795  < 2e-16
## fourier(period = "1 day", K = 3)C1_24    -2.5552593  0.0596198 -42.859  < 2e-16
## fourier(period = "1 day", K = 3)S1_24    -2.9253156  0.0650771 -44.952  < 2e-16
## fourier(period = "1 day", K = 3)C2_24     0.9931345  0.0565636  17.558  < 2e-16
## fourier(period = "1 day", K = 3)S2_24     1.1763642  0.0600489  19.590  < 2e-16
## fourier(period = "1 day", K = 3)C3_24    -0.2672563  0.0557770  -4.792 1.69e-06
## fourier(period = "1 day", K = 3)S3_24     0.1400547  0.0563092   2.487  0.01290
## fourier(period = "1 year", K = 3)C1_8766 -2.8549810  0.4424449  -6.453 1.17e-10
## fourier(period = "1 year", K = 3)S1_8766 -8.1663552  0.2476783 -32.972  < 2e-16
## fourier(period = "1 year", K = 3)C2_8766  3.1586629  0.1787405  17.672  < 2e-16
## fourier(period = "1 year", K = 3)S2_8766  2.0487480  0.1358172  15.085  < 2e-16
## fourier(period = "1 year", K = 3)C3_8766 -0.0520145  0.0725616  -0.717  0.47350
## fourier(period = "1 year", K = 3)S3_8766  0.9252624  0.1092749   8.467  < 2e-16
## CO                                        0.1873384  0.0617437   3.034  0.00242
## C6H6                                      0.1506110  0.0100857  14.933  < 2e-16
## NOy                                      -0.0070573  0.0005080 -13.893  < 2e-16
## NO2                                       0.0001194  0.0018826   0.063  0.94943
## lag(CO, 24)                               0.1041761  0.0619981   1.680  0.09294
## lag(C6H6, 24)                             0.0503747  0.0101037   4.986 6.31e-07
## lag(NOy, 24)                             -0.0002720  0.0005087  -0.535  0.59285
## lag(NO2, 24)                             -0.0048802  0.0018770  -2.600  0.00934
##                                             
## (Intercept)                              ***
## trend()                                  ***
## fourier(period = "1 day", K = 3)C1_24    ***
## fourier(period = "1 day", K = 3)S1_24    ***
## fourier(period = "1 day", K = 3)C2_24    ***
## fourier(period = "1 day", K = 3)S2_24    ***
## fourier(period = "1 day", K = 3)C3_24    ***
## fourier(period = "1 day", K = 3)S3_24    *  
## fourier(period = "1 year", K = 3)C1_8766 ***
## fourier(period = "1 year", K = 3)S1_8766 ***
## fourier(period = "1 year", K = 3)C2_8766 ***
## fourier(period = "1 year", K = 3)S2_8766 ***
## fourier(period = "1 year", K = 3)C3_8766    
## fourier(period = "1 year", K = 3)S3_8766 ***
## CO                                       ** 
## C6H6                                     ***
## NOy                                      ***
## NO2                                         
## lag(CO, 24)                              .  
## lag(C6H6, 24)                            ***
## lag(NOy, 24)                                
## lag(NO2, 24)                             ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.337 on 7298 degrees of freedom
## Multiple R-squared: 0.854,   Adjusted R-squared: 0.8536
## F-statistic:  2033 on 21 and 7298 DF, p-value: < 2.22e-16
# Step 2: drop contemporaneous NO2 (p = 0.949 in the previous fit).
predict_model <- train_ts %>%
    model(TSLM(TempM ~ trend() +
                       fourier(period = "1 day", K = 3) +
                       fourier(period = "1 year", K = 3) +
                       CO + C6H6 + NOy +
                       lag(CO, 24) + lag(C6H6, 24) +
                       lag(NOy, 24) + lag(NO2, 24)))
report(predict_model)
## Series: TempM 
## Model: TSLM 
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.4566  -2.1120  -0.0238   2.2399  12.7771 
## 
## Coefficients:
##                                            Estimate Std. Error t value Pr(>|t|)
## (Intercept)                              27.0153962  0.7090081  38.103  < 2e-16
## trend()                                  -0.0025841  0.0001872 -13.802  < 2e-16
## fourier(period = "1 day", K = 3)C1_24    -2.5556732  0.0592576 -43.128  < 2e-16
## fourier(period = "1 day", K = 3)S1_24    -2.9260439  0.0640515 -45.683  < 2e-16
## fourier(period = "1 day", K = 3)C2_24     0.9933250  0.0564799  17.587  < 2e-16
## fourier(period = "1 day", K = 3)S2_24     1.1760777  0.0598747  19.642  < 2e-16
## fourier(period = "1 day", K = 3)C3_24    -0.2672110  0.0557686  -4.791 1.69e-06
## fourier(period = "1 day", K = 3)S3_24     0.1401956  0.0562616   2.492  0.01273
## fourier(period = "1 year", K = 3)C1_8766 -2.8555317  0.4423295  -6.456 1.15e-10
## fourier(period = "1 year", K = 3)S1_8766 -8.1651412  0.2469207 -33.068  < 2e-16
## fourier(period = "1 year", K = 3)C2_8766  3.1589507  0.1786707  17.680  < 2e-16
## fourier(period = "1 year", K = 3)S2_8766  2.0491144  0.1356850  15.102  < 2e-16
## fourier(period = "1 year", K = 3)C3_8766 -0.0522345  0.0724737  -0.721  0.47109
## fourier(period = "1 year", K = 3)S3_8766  0.9252629  0.1092674   8.468  < 2e-16
## CO                                        0.1887034  0.0578669   3.261  0.00112
## C6H6                                      0.1506516  0.0100647  14.968  < 2e-16
## NOy                                      -0.0070488  0.0004898 -14.392  < 2e-16
## lag(CO, 24)                               0.1037752  0.0616709   1.683  0.09247
## lag(C6H6, 24)                             0.0503104  0.0100521   5.005 5.72e-07
## lag(NOy, 24)                             -0.0002762  0.0005044  -0.548  0.58401
## lag(NO2, 24)                             -0.0048267  0.0016759  -2.880  0.00399
##                                             
## (Intercept)                              ***
## trend()                                  ***
## fourier(period = "1 day", K = 3)C1_24    ***
## fourier(period = "1 day", K = 3)S1_24    ***
## fourier(period = "1 day", K = 3)C2_24    ***
## fourier(period = "1 day", K = 3)S2_24    ***
## fourier(period = "1 day", K = 3)C3_24    ***
## fourier(period = "1 day", K = 3)S3_24    *  
## fourier(period = "1 year", K = 3)C1_8766 ***
## fourier(period = "1 year", K = 3)S1_8766 ***
## fourier(period = "1 year", K = 3)C2_8766 ***
## fourier(period = "1 year", K = 3)S2_8766 ***
## fourier(period = "1 year", K = 3)C3_8766    
## fourier(period = "1 year", K = 3)S3_8766 ***
## CO                                       ** 
## C6H6                                     ***
## NOy                                      ***
## lag(CO, 24)                              .  
## lag(C6H6, 24)                            ***
## lag(NOy, 24)                                
## lag(NO2, 24)                             ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.337 on 7299 degrees of freedom
## Multiple R-squared: 0.854,   Adjusted R-squared: 0.8536
## F-statistic:  2135 on 20 and 7299 DF, p-value: < 2.22e-16
# Step 3: drop lag(NOy, 24) as well (p = 0.584 in the previous fit).
predict_model <- train_ts %>%
    model(TSLM(TempM ~ trend() +
                       fourier(period = "1 day", K = 3) +
                       fourier(period = "1 year", K = 3) +
                       CO + C6H6 + NOy +
                       lag(CO, 24) + lag(C6H6, 24) + lag(NO2, 24)))
report(predict_model)
## Series: TempM 
## Model: TSLM 
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -12.44237  -2.11714  -0.02053   2.24370  12.76143 
## 
## Coefficients:
##                                            Estimate Std. Error t value Pr(>|t|)
## (Intercept)                              27.0385857  0.7077082  38.206  < 2e-16
## trend()                                  -0.0025857  0.0001872 -13.813  < 2e-16
## fourier(period = "1 day", K = 3)C1_24    -2.5562688  0.0592447 -43.148  < 2e-16
## fourier(period = "1 day", K = 3)S1_24    -2.9315717  0.0632479 -46.350  < 2e-16
## fourier(period = "1 day", K = 3)C2_24     0.9923391  0.0564485  17.580  < 2e-16
## fourier(period = "1 day", K = 3)S2_24     1.1745608  0.0598077  19.639  < 2e-16
## fourier(period = "1 day", K = 3)C3_24    -0.2666500  0.0557565  -4.782 1.77e-06
## fourier(period = "1 day", K = 3)S3_24     0.1408785  0.0562450   2.505 0.012276
## fourier(period = "1 year", K = 3)C1_8766 -2.8664126  0.4418617  -6.487 9.32e-11
## fourier(period = "1 year", K = 3)S1_8766 -8.1570794  0.2464696 -33.096  < 2e-16
## fourier(period = "1 year", K = 3)C2_8766  3.1589713  0.1786621  17.681  < 2e-16
## fourier(period = "1 year", K = 3)S2_8766  2.0536349  0.1354271  15.164  < 2e-16
## fourier(period = "1 year", K = 3)C3_8766 -0.0537652  0.0724163  -0.742 0.457841
## fourier(period = "1 year", K = 3)S3_8766  0.9252416  0.1092622   8.468  < 2e-16
## CO                                        0.1970763  0.0558073   3.531 0.000416
## C6H6                                      0.1512756  0.0099995  15.128  < 2e-16
## NOy                                      -0.0071642  0.0004420 -16.207  < 2e-16
## lag(CO, 24)                               0.0896344  0.0560005   1.601 0.109509
## lag(C6H6, 24)                             0.0486305  0.0095720   5.080 3.86e-07
## lag(NO2, 24)                             -0.0050431  0.0016286  -3.097 0.001965
##                                             
## (Intercept)                              ***
## trend()                                  ***
## fourier(period = "1 day", K = 3)C1_24    ***
## fourier(period = "1 day", K = 3)S1_24    ***
## fourier(period = "1 day", K = 3)C2_24    ***
## fourier(period = "1 day", K = 3)S2_24    ***
## fourier(period = "1 day", K = 3)C3_24    ***
## fourier(period = "1 day", K = 3)S3_24    *  
## fourier(period = "1 year", K = 3)C1_8766 ***
## fourier(period = "1 year", K = 3)S1_8766 ***
## fourier(period = "1 year", K = 3)C2_8766 ***
## fourier(period = "1 year", K = 3)S2_8766 ***
## fourier(period = "1 year", K = 3)C3_8766    
## fourier(period = "1 year", K = 3)S3_8766 ***
## CO                                       ***
## C6H6                                     ***
## NOy                                      ***
## lag(CO, 24)                                 
## lag(C6H6, 24)                            ***
## lag(NO2, 24)                             ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.337 on 7300 degrees of freedom
## Multiple R-squared: 0.854,   Adjusted R-squared: 0.8536
## F-statistic:  2248 on 19 and 7300 DF, p-value: < 2.22e-16
# Step 4: reduce the yearly Fourier order to K = 2
# (the K = 3 cosine term was not significant in the previous fits).
predict_model <- train_ts %>%
    model(TSLM(TempM ~ trend() +
                       fourier(period = "1 day", K = 3) +
                       fourier(period = "1 year", K = 2) +
                       CO + C6H6 + NOy +
                       lag(CO, 24) + lag(C6H6, 24) + lag(NO2, 24)))
report(predict_model)
## Series: TempM 
## Model: TSLM 
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -12.823705  -2.130351   0.007951   2.233559  13.167983 
## 
## Coefficients:
##                                            Estimate Std. Error t value Pr(>|t|)
## (Intercept)                               2.199e+01  3.737e-01  58.848  < 2e-16
## trend()                                  -1.230e-03  9.666e-05 -12.725  < 2e-16
## fourier(period = "1 day", K = 3)C1_24    -2.544e+00  5.949e-02 -42.753  < 2e-16
## fourier(period = "1 day", K = 3)S1_24    -2.910e+00  6.345e-02 -45.869  < 2e-16
## fourier(period = "1 day", K = 3)C2_24     9.981e-01  5.671e-02  17.598  < 2e-16
## fourier(period = "1 day", K = 3)S2_24     1.193e+00  6.004e-02  19.873  < 2e-16
## fourier(period = "1 day", K = 3)C3_24    -2.692e-01  5.602e-02  -4.805 1.58e-06
## fourier(period = "1 day", K = 3)S3_24     1.311e-01  5.650e-02   2.320 0.020346
## fourier(period = "1 year", K = 2)C1_8766 -6.031e+00  2.310e-01 -26.110  < 2e-16
## fourier(period = "1 year", K = 2)S1_8766 -6.442e+00  1.380e-01 -46.694  < 2e-16
## fourier(period = "1 year", K = 2)C2_8766  1.903e+00  9.930e-02  19.163  < 2e-16
## fourier(period = "1 year", K = 2)S2_8766  1.248e+00  8.569e-02  14.560  < 2e-16
## CO                                        2.109e-01  5.602e-02   3.764 0.000169
## C6H6                                      1.516e-01  1.005e-02  15.089  < 2e-16
## NOy                                      -7.147e-03  4.438e-04 -16.104  < 2e-16
## lag(CO, 24)                               1.172e-01  5.614e-02   2.087 0.036909
## lag(C6H6, 24)                             4.791e-02  9.617e-03   4.982 6.44e-07
## lag(NO2, 24)                             -5.156e-03  1.631e-03  -3.161 0.001578
##                                             
## (Intercept)                              ***
## trend()                                  ***
## fourier(period = "1 day", K = 3)C1_24    ***
## fourier(period = "1 day", K = 3)S1_24    ***
## fourier(period = "1 day", K = 3)C2_24    ***
## fourier(period = "1 day", K = 3)S2_24    ***
## fourier(period = "1 day", K = 3)C3_24    ***
## fourier(period = "1 day", K = 3)S3_24    *  
## fourier(period = "1 year", K = 2)C1_8766 ***
## fourier(period = "1 year", K = 2)S1_8766 ***
## fourier(period = "1 year", K = 2)C2_8766 ***
## fourier(period = "1 year", K = 2)S2_8766 ***
## CO                                       ***
## C6H6                                     ***
## NOy                                      ***
## lag(CO, 24)                              *  
## lag(C6H6, 24)                            ***
## lag(NO2, 24)                             ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.352 on 7302 degrees of freedom
## Multiple R-squared: 0.8526,  Adjusted R-squared: 0.8522
## F-statistic:  2484 on 17 and 7302 DF, p-value: < 2.22e-16

We end up with a model that includes the concentrations of Carbon Monoxide, Benzene, Nitrogen Oxides 5 and a one day retarded concentration of Carbon Monoxide, Benzene and Nitrogen Dioxide.

Now we use cross-validation as before to compare this model with the one that VAR may predict for us.

# Cross-validate the multivariate candidates: a VAR over all series against
# the TSLM selected above. Only the Temp component of the VAR is compared.
models_TempM <- c("VAR", "TSLM")
for (sp in enumerate(cvsplits$splits)) {
    current  <- sp[[2]]
    trn_data <- as_tsibble(training(current), index = Date)
    tst_data <- as_tsibble(testing(current),  index = Date)

    acc_var  <- trn_data %>%
        model(VAR(vars(Temp, CO, C6H6, NOy, NO2) ~ AR(24) + AR(8766))) %>%
        forecast(tst_data) %>%
        accuracy(data_ts)

    acc_tslm <- trn_data %>%
        model(TSLM(TempM ~ trend() + fourier(period = "1 day", K = 3) +
                       fourier(period = "1 year", K = 2) +
                       CO + C6H6 + NOy +
                       lag(CO, 24) + lag(C6H6, 24) + lag(NO2, 24))) %>%
        forecast(tst_data) %>%
        accuracy(data_ts)

    # Compare the VAR's RMSE on Temp against the TSLM's RMSE.
    rmses <- c(filter(acc_var, .response == "Temp")$RMSE, acc_tslm$RMSE)
    print(paste("The best model for split", sp$ix, "is",
                models_TempM[which.min(rmses)]))
}
## [1] "The best model for split 1 is TSLM"
## [1] "The best model for split 2 is TSLM"
## [1] "The best model for split 3 is VAR"

Now that we have checked that the best model for the temperature is the proposed TSLM, we continue by looking at the residuals of the model, which again follow the exact same behaviour we observed for all previous series. The performance on the test partition is quite similar to the one we observed when no predictors were considered. However, this time there is a slight bend upwards which indicates that the model has better captured the trend. This gives us hope for the future forecasts.

# Fit the selected multivariate TSLM on the whole training partition.
model_TempM <- model(
    train_ts,
    TSLM(TempM ~ trend() + fourier(period = "1 day", K = 3) +
                 fourier(period = "1 year", K = 2) +
                 CO + C6H6 + NOy +
                 lag(CO, 24) + lag(C6H6, 24) + lag(NO2, 24)))

# Residual diagnostics for the fitted model.
model_TempM %>%
    residuals() %>%
    pretty_resid("TempM", title = "Residuals for Temperature (multi)")

# Test-partition forecast overlaid on the 2005 part of the training series.
train_ts %>%
    filter(year(Date) == 2005) %>%
    autoplot(Temp) +
    autolayer(forecast(model_TempM, test_ts), TempM, colour = colors["TempM"]) +
    autolayer(test_ts, TempM, colour = "#838B8B", alpha = 0.75) +
    labs(x = "", y = "Temperature (multi) [°C]")

Six month forecasts and conclusions

Finally, we conclude the work done in this section by using the selected models to produce a six-month forecast for both the polluting agents and the two temperature models. We note that all of the predictions look reasonable in general terms, especially in the first few months of the forecast. Later on, the confidence intervals either explode, giving us little to no information, or the forecasts reach unphysical results.

The latter observation is more prominent for the univariate temperature model. On the other hand, the multivariate model encapsulates the yearly seasonality and provides quite reasonable and stable prediction intervals. Hence we claim that, by accounting for the possible effects of the different polluting agents, we end up with a very reasonable model for future temperature forecasts.

# Box-Cox lambdas, estimated on each full series with auto_lambda().
lam_CO   <- auto_lambda(data_ts[["CO"]])
lam_C6H6 <- auto_lambda(data_ts[["C6H6"]])
lam_NOy  <- auto_lambda(data_ts[["NOy"]])
lam_NO2  <- auto_lambda(data_ts[["NO2"]])

# Six-month forecasts from the univariate models selected earlier, each
# fitted on the full series with its Box-Cox transform where applicable.
forecast_CO   <- forecast(
    model(data_ts, prophet(box_cox(CO, lam_CO) ~ season(period = 24))),
    h = "6 months")
forecast_C6H6 <- forecast(
    model(data_ts, prophet(box_cox(C6H6, lam_C6H6) ~ season(period = 24))),
    h = "6 months")
forecast_NOy  <- forecast(
    model(data_ts, prophet(box_cox(NOy, lam_NOy) ~ season(period = 24))),
    h = "6 months")
forecast_NO2  <- forecast(
    model(data_ts, ARIMA(box_cox(NO2, lam_NO2) ~ PDQ(period = 24))),
    h = "6 months")
forecast_Temp <- forecast(
    model(data_ts, prophet(Temp ~ season(period = 24))),
    h = "6 months")

# Forecast of the multivariate model: the future values of the exogenous
# agent regressors are taken from the point (mean) forecasts of the
# univariate models above.
future_data <- new_data(data_ts, n = nrow(forecast_Temp)) %>%
    mutate(CO  = forecast_CO$.mean,  C6H6 = forecast_C6H6$.mean,
           NOy = forecast_NOy$.mean, NO2  = forecast_NO2$.mean)
# Use period = "1 day" (instead of the numeric 24) so the specification
# matches the model selected during cross-validation; for this hourly
# series both describe the same daily season.
# NOTE(review): the lag(·, 24) terms need the last day of observed agent
# values at the forecast origin — confirm the first 24 h of the forecast
# are not degraded by missing lagged regressors.
forecast_TempM <- data_ts %>%
    model(TSLM(TempM ~ trend() + fourier(period = "1 day", K = 3) +
                       fourier(period = "1 year", K = 2) +
                       CO + C6H6 + NOy +
                       lag(CO, 24) + lag(C6H6, 24) + lag(NO2, 24))) %>%
    forecast(future_data)

# Overlay each six-month forecast on top of its full observed series.
p1 <- autoplot(data_ts, CO) +
    autolayer(forecast_CO, CO, color = colors["CO"]) +
    labs(x = "", y = "Carbon Monoxide [mg/m³]")
p2 <- autoplot(data_ts, C6H6) +
    autolayer(forecast_C6H6, C6H6, color = colors["C6H6"]) +
    labs(x = "", y = "Benzene [μg/m³]")
# Clip the y axes of the NOy and NO2 panels so the observed series stay
# readable next to the forecast intervals.
p3 <- autoplot(data_ts, NOy) +
    autolayer(forecast_NOy, NOy, color = colors["NOy"]) +
    labs(x = "", y = "Nitrogen Oxides [ppb]") +
    coord_cartesian(ylim = c(NA, 1500))
p4 <- autoplot(data_ts, NO2) +
    autolayer(forecast_NO2, NO2, color = colors["NO2"]) +
    labs(x = "", y = "Nitrogen Dioxide [μg/m³]") +
    coord_cartesian(ylim = c(0, 500))
p5 <- autoplot(data_ts, Temp) +
    autolayer(forecast_Temp, Temp, color = colors["Temp"]) +
    labs(x = "", y = "Temperature [°C]")
p6 <- autoplot(data_ts, TempM) +
    autolayer(forecast_TempM, TempM, color = colors["TempM"]) +
    labs(x = "", y = "Temperature (multi) [°C]")

# Stack the panels in pairs with patchwork.
p1 / p2

p3 / p4

p5 / p6


  1. Remember we are dealing with data coming from an Italian city.↩︎

  2. The reason why we use “UTC+1” and not “CET” is that the data does not account for seasonal time shift for energy saving so neither should we.↩︎

  3. Internal testing showed that Prophet, ETS and ARIMA frequently tend to give null models when adding predictors. Moreover, the two models considered here offer a very neat way of knowing whether the considered variables are indeed related to the temperature or, as in the case of VAR, provide feature selection in themselves.↩︎

  4. This does not mean that the concentrations are not going to be correlated — they are — but rather that a high concentration of one agent should not produce an effect on the rest, as the relation should lie in both having the same emission channels.↩︎

  5. Remember that this variable excludes (to a first approximation) the contributions due to Nitrogen Dioxide.↩︎